% $Header: /home/grad2/araim1/cvs_root/beamer-example/beamer-example.tex,v 1.1.1.1 2008/03/22 18:58:58 araim1 Exp $

% Adopted from Till Tantau's template that comes packaged with Beamer

\documentclass[compress]{beamer}

\mode<presentation>

% Boadilla is a nice theme, but I don't like the big spherical bullet points. I also
% want a blue bar across the top of every slide

\usetheme{Boadilla}
\setbeamercolor*{frametitle}{parent=palette primary}
\setbeamertemplate{items}[default]
\setbeamertemplate{sections/subsections in toc}[sections numbered]

\setbeamercovered{transparent}

\usepackage{beamerinnerthemeumbcboxes}
\usepackage{listings}
\usepackage[noend]{algorithmic}
\usepackage{algorithm}
\usepackage[english]{babel}

\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{fancyvrb}
% (removed duplicate \usepackage[noend]{algorithmic}; already loaded above)
\usepackage{color}
\usepackage{subfigure}

\newtheorem*{defn1}{Orthogonal Vectors}
\newtheorem*{defn2}{Orthogonal Matrix}
\newtheorem*{defn3}{Diagonal Matrix}
\newtheorem*{defn4}{Singular Value Decomposition}
\newtheorem*{defn5}{Rotation Matrix}


\title{Cell SVD Project Presentation}

\subtitle{CMSC 691A, Spring 2008}

\author[araim1, lukgeo1]{Andrew~M.~Raim, Luke~Georgalas}

\institute[UMBC]
{
Department of Computer Science\\
University of Maryland Baltimore County}

\date{5/13/2008}

\subject{Talks}

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}{Outline}
\tableofcontents
% You might wish to add the option [pausesections]
\end{frame}

\section{Introduction}
\begin{frame}{Objectives}
\begin{itemize}
\item Compute the Singular Value Decomposition (SVD) on large matrices
\item Take advantage of the Cell Broadband Engine (CBE) architecture to compute quickly
\item Start with the \emph{Stream SVD} algorithm presented in Strumpen, and make it suitable for use on Cell
\end{itemize}
\end{frame}

\section{Background}
\begin{frame}[shrink]{Some linear algebra}
% Orthogonal vectors
\begin{defn1}
Two vectors $u, v \in \mathbb{R}^n$ are orthogonal if $u \cdot v = u^T v = u_1 v_1 + \cdots + u_n v_n = 0$
\end{defn1}

Note that $u \cdot u = u_1 u_1 + \cdots + u_n u_n = || u ||^2$. If $u$ is normalized, then $u \cdot u = 1$

% Orthogonal matrix
\begin{defn2}
A matrix $Q$ is orthogonal if all row vectors are pairwise orthogonal, in other words $Q Q^T = Q^T Q = I$ (assuming
that each row has been normalized). In this case, $Q^T = Q^{-1}$
\end{defn2}

The product of two orthogonal matrices $Q_1, Q_2$ is also an orthogonal matrix

\begin{displaymath}
(Q_1 Q_2) (Q_1 Q_2)^T = Q_1 Q_2 Q_2^T Q_1^T = Q_1 I Q_1^T = Q_1 Q_1^T = I.
\end{displaymath}

% Diagonal matrix
\begin{defn3}
An $m \times n$ matrix $M$ is \emph{diagonal} if $M_{ij} = 0$ for all $i \neq j$. The remaining entries may or may not
be non-zero
\end{defn3}
\end{frame}


\begin{frame}[shrink]{SVD}
% SVD
\begin{defn4}
Suppose $A$ is an $m \times n$ matrix, where each entry $a_{ij} \in \mathbb{R}$. The singular value decomposition is

\begin{displaymath}
A = U S V^T
\end{displaymath}

Where:
\begin{itemize}
\item $U$ is an $m \times m$ orthogonal matrix
\item $S$ is an $m \times n$ diagonal matrix, with the last $m - n$ rows containing all zero entries. The diagonal entries
are known as the \emph{singular values}. By convention, the rows of $S$ are ordered by singular value, so that the top row
contains the largest value and so on.
\item $V^T$ is an $n \times n$ orthogonal matrix
\end{itemize}
\end{defn4}

\end{frame}



\begin{frame}[shrink]{LSI}
Latent Semantic Indexing (LSI) is a technique used in the field of Information Retrieval to semantically represent the relationship
between terms, documents, and concepts. It is based on the SVD. LSI is built around the term-document matrix---a matrix of $M$ rows (terms)
by $N$ columns (documents), with each entry containing the frequency of the term in the corresponding document. This matrix is
built from a \emph{corpus} of documents. 

\begin{displaymath}
A =  
\underbrace{
\left[
\begin{array}{cccc}
  freq_{1,1} &   freq_{1,2} &  \cdots & freq_{1,n} \\
  freq_{2,1} &   freq_{2,2} &  \cdots & freq_{2,n} \\
  \vdots  &   \vdots &   \ddots & \vdots  \\
  freq_{m,1} &   freq_{m,2} &  \cdots & freq_{m,n} \\
\end{array}
\right] }_{\textrm{(documents)}}
\textrm{(terms)}
\end{displaymath}

\begin{displaymath}
\underbrace{
\left[
\begin{array}{cccc}
  t_{1,1} &   t_{1,2} &  \cdots & t_{1,m} \\
  t_{2,1} &   t_{2,2} &  \cdots & t_{2,m} \\
  \vdots  &   \vdots &   \ddots & \vdots  \\
  t_{m,1} &   t_{m,2} &  \cdots & t_{m,m} \\
\end{array}
\right] }_{T}
%
\underbrace{
\left[
\begin{array}{cccc}
  s_{1,1} &   0 &  \cdots & 0 \\
  0 &   s_{2,2} &  \cdots & 0 \\
  \vdots  &   \vdots &   \ddots & \vdots  \\
  0 &   0 &  \cdots & s_{n,n} \\
  0 &   0 &  \cdots & 0 \\
  \vdots  &   \vdots &   \ddots & \vdots  \\
  0 &   0 &  \cdots & 0 \\
\end{array}
\right] }_{S}
%
\underbrace{
\left[
\begin{array}{cccc}
  d^T_{1,1} &   d^T_{1,2} &  \cdots & d^T_{1,n} \\
  d^T_{2,1} &   d^T_{2,2} &  \cdots & d^T_{2,n} \\
  \vdots  &   \vdots &   \ddots & \vdots  \\
  d^T_{n,1} &   d^T_{n,2} &  \cdots & d^T_{n,n} \\
\end{array}
\right] }_{D^T}
\end{displaymath}

Where:
\begin{itemize}
\item $S$ is the matrix of singular values, which represent semantic concepts
\item $T$ maps sets of terms to these concepts
\item $D^T$ maps from these concepts to a set of documents  
\end{itemize}

\end{frame}


\begin{frame}[shrink]{Reuters Corpus Matrix}
\begin{itemize}
\item Number of terms $M = 54574$
\item Number of documents $N = 19043$
\item Total number of entries in $A_{m,n}$ is $54574 \times 19043 = 1039252682$ (roughly $991$ MB at one byte per entry)
\item Total number of non-zero entries is $1215837$, which is only $0.117 \%$ of the total
\item If each entry is represented by a single byte, the full matrix would be $991$ MB but the non-zeroes would be only $1.16$ MB
\item The SVD components (as well as the working copy of $A$) would require floating point elements for storage:
  \begin{itemize}
  \item $A_{m,n}$ : $54574 \times 19043 \times 4 = 3.872$ GB
  \item $U_{m,m}$ : $54574^2 \times 4 = 11.095$ GB
  \item $S[1 : n]$ : $19043 \times 4 = 74.387$ KB, if we store only the diagonal
  \item $V^T_{n,n}$ : $19043^2 \times 4 = 1.351$ GB
  \end{itemize}
\end{itemize}

It is possible to represent the original $A$ matrix sparsely and save a significant amount of space. Unfortunately, due to the nature of
our rotations, the working copy of $A$ will quickly have zero entries transformed into non-zeroes.

Storage becomes a limiting factor in this algorithm.

Let's try to achieve good performance for the general purpose SVD calculation on the Cell, and worry about handling these particular
matrices later.
\end{frame}

\section{Explanation of the General Algorithm}
\begin{frame}[shrink]{Literature Review}
Our project is based primarily off of the work in Strumpen \cite{strumpen}. The authors of this paper describe several classical
SVD algorithms that do not parallelize well, including Golub-Kahan-Reinsch and the classical Jacobi algorithm. They describe
the Hestenes-Jacobi algorithm, which is more amenable to parallelization, and base their implementation on it.
\end{frame}

\begin{frame}[shrink]{Orthogonalization Approach}
The idea behind Hestenes-Jacobi is to transform the row vectors of the original matrix $A$ so that (1) all rows are pairwise orthogonal
to each other, and (2) the norms of the original vectors are preserved.

\begin{defn5}
A matrix operation $Q$ is a \emph{rotation} if it preserves vector length so that $|| Q v || = || v || $. 
\end{defn5}

It turns out that the class of orthogonal matrices is exactly the same as the class of rotation matrices \cite{wiki:rotation}. 

Rotations are iteratively applied to pairs of rows from the original matrix, until sufficient orthogonality has been reached at step $t$. 
Suppose rotation $\Phi^{(i)}$ is applied on the $i$th iteration of the algorithm, to obtain result $A^{(i)}$. We can accumulate these
rotations
into a single rotation

\begin{displaymath}
U^T \gets \Phi^{(t)} \Phi^{(t-1)} \cdots \Phi^{(2)} \Phi^{(1)} I 
\end{displaymath}

Since $U^T$ is a product of orthogonal transformations, it is itself orthogonal (as shown on the earlier linear algebra slide). Then $U^T = U^{-1}$, and we can use it in the
definition of the SVD
to find the singular value matrix $S$, and the other orthogonal matrix $V^T$
\end{frame}

\begin{frame}[shrink]{Orthogonalization Approach}{Derivation}
\begin{eqnarray*}
A^{(t)} &=& \Phi^{(t)} \Phi^{(t-1)} \cdots \Phi^{(2)} \Phi^{(1)} A \\
&=& U^T A \\
&=& S V^T \\
&=& 
\left[
\begin{array}{cccc}
  s_1     & 0        & \cdots & 0 \\
  0       & s_2      & \cdots & 0 \\
  \vdots  &   \vdots & \ddots & \vdots  \\
  0       & 0        & \cdots & s_n \\
  0       & 0        & \cdots & 0 \\
  \vdots  &   \vdots & \ddots & \vdots  \\
  0       & 0        & \cdots & 0
\end{array}
\right]
\left[
\begin{array}{cccc}
  v^T_{1,1} &   v^T_{1,2} &  \cdots & v^T_{1,n} \\
  v^T_{2,1} &   v^T_{2,2} &  \cdots & v^T_{2,n} \\
  \vdots    &   \vdots    &  \ddots & \vdots  \\
  v^T_{n,1} &   v^T_{n,2} &  \cdots & v^T_{n,n} \\
\end{array}
\right] \\
&=& 
\left[
\begin{array}{cccc}
  s_1 v^T_{1,1} &   s_1 v^T_{1,2} &  \cdots & s_1 v^T_{1,n} \\
  s_2 v^T_{2,1} &   s_2 v^T_{2,2} &  \cdots & s_2 v^T_{2,n} \\
  \vdots        &   \vdots        &  \ddots & \vdots  \\
  s_n v^T_{n,1} &   s_n v^T_{n,2} &  \cdots & s_n v^T_{n,n} \\
\end{array}
\right]. \\
\end{eqnarray*}

Therefore
\begin{equation}
S_{i,i} = || A^{(t)}[i] || 
\quad \textrm{and} \quad
v^T[i] = \frac{ A^{(t)}[i] }{ || A^{(t)}[i] || }
\qquad \textrm{for } i = 1, \cdots, n
\end{equation}
\end{frame}

\begin{frame}[shrink]{Orthogonalization Approach}{Derivation of $U$}
Note that instead of accumulating $U^T$ explicitly, it can also be calculated from the original $A$ and resulting $A^{(t)}$
% NOTE(review): the second step below assumes A^{(t)} A^{(t)T} = I, which holds only if the
% rows of A^{(t)} are normalized; in general A^{(t)} A^{(t)T} = S S^T -- verify this derivation
\begin{eqnarray*}
A^{(t)} &=& U^T A \\
I &=& U^T A \, A^{(t) T} \\
I^T &=& \left( U^T A \, A^{(t) T} \right)^T \\
I &=& A^{(t)} A^T U  \\
U^T &=& A^{(t)} A^T \\
\end{eqnarray*}
\end{frame}


\begin{frame}[shrink]{Vector Rotation}
The rotation angle $\theta$ is chosen based on the angle $\phi$ between the row vectors $A[i]$ and $A[j]$, so that $\phi + \theta =
\frac{\pi}{2}$. 

\begin{figure}
\centering
\subfigure[Acute correction]{
\fbox{\includegraphics[scale=0.5]{graphics/AcuteRotation.png}}
\label{fig:rot_example_acute}
}
\subfigure[Obtuse correction]{
\fbox{\includegraphics[scale=0.5]{graphics/ObtuseRotation.png}}
\label{fig:rot_example_obtuse}
}
\caption{Illustration of rotations to correct acute angle \subref{fig:rot_example_acute} and obtuse angle \subref{fig:rot_example_obtuse} 
while orthogonalizing a pair of row vectors}
\label{fig:rot_example}
\end{figure}
\end{frame}


\begin{frame}[shrink]{Jacobi Rotation}
It can be shown that this transformation is orthogonal, and is therefore a rotation. It can also be shown that it only affects elements
$i$ and $j$
in any vector it is applied to. Or if applied to a data matrix, it will only affect rows $i$ and $j$. This is crucial for parallelizing
the SVD, as we will
see later.

\begin{displaymath}
J^T A =
\left[
\begin{array}{ccccccc}
1 & & & & & & \\
& \ddots & & & & & \\
& & \cos\theta & & -\sin\theta & & \\
& & & \ddots & & & \\
& & \sin\theta & & \cos\theta & & \\
& & & & & \ddots & \\
& & & & & & 1 \\
\end{array}
\right]^T
\left[
\begin{array}{cc}
A[1] \\
A[2] \\
\vdots \\
A[m] \\
\end{array}
\right]
\end{displaymath}

\begin{eqnarray}
\left[
\begin{array}{cc}
A[i]_{new} \\
A[j]_{new} \\
\end{array}
\right]
&=&
\left[
\begin{array}{cc}
\cos\theta & -\sin\theta \\
\sin\theta & \cos\theta \\
\end{array}
\right]^T
\left[
\begin{array}{c}
A[i] \\
A[j] \\
\end{array}
\right] \nonumber \\
A[i]_{new} &=& A[i] \cdot \cos\theta - A[j] \cdot \sin\theta \\
A[j]_{new} &=& A[i] \cdot \sin\theta + A[j] \cdot \cos\theta
\end{eqnarray}

\begin{equation}
A[k]_{new} = A[k] \qquad \textrm{ for } k \neq i \textrm{ and } k \neq j
\end{equation}
\end{frame}


\begin{frame}{Strumpen's RAW Implementation}
\begin{itemize}
\item Assumes an $R \times R$ array of processors
\item Data flows between processors in the array, in a systolic manner
\item Processors perform calculations as data passes through
\item We will try a different approach for Cell, to allow the SPEs to work independently
\end{itemize}
\end{frame}


\section{Our Implementation}
\begin{frame}{Creating row blocks}
\begin{small}

Original data matrix $A$ =
\begin{displaymath}
\left[
\begin{array}{cccc}
a_{1,1} & a_{1,2} & \cdots & a_{1,n} \\
a_{2,1} & a_{2,2} & \cdots & a_{2,n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{m,1} & a_{m,2} & \cdots & a_{m,n} \\
\end{array}
\right]
\end{displaymath}

Group the rows of $A$ into blocks of size $R$. Call the resulting blocks $rb_0 \cdots rb_k$.

\begin{displaymath}
\begin{array}{|c|}
\hline
rb_0 \textrm{: rows } 1 \textrm{ to } R \\ \hline
rb_1 \textrm{: rows } R+1 \textrm{ to } 2R \\ \hline
\vdots \\ \hline
rb_k \textrm{: rows } kR+1 \textrm{ to } M \\ \hline
\end{array}
\end{displaymath}

\end{small}
\end{frame}


\begin{frame}{The Work Matrix}
\begin{small}

When performing Algorithm 1, we must perform a computation on each pair of row blocks. We can create
a matrix to show the work that must be done.

\begin{displaymath}
Work_{k,k} =
\begin{array}{|c|c|c|c|}
\hline
rb_{1,1} & rb_{1,2} & \cdots & rb_{1,k} \\ \hline
rb_{2,1} & rb_{2,2} & \cdots & rb_{2,k} \\ \hline
\vdots & \vdots & \ddots & \vdots \\ \hline
rb_{k,1} & rb_{k,2} & \cdots & rb_{k,k} \\ \hline
\end{array}
\end{displaymath}

Only the upper triangular portion of the matrix must be calculated. Computing $Work(rb_i, rb_j)$ will
modify the rows in both $rb_i$ and $rb_j$.

\end{small}
\end{frame}


\begin{frame}{Parallelization}
Notice that in the course of processing $Work(rb_i, rb_j)$, all rows within those
blocks are modified. Therefore, if we are to parallelize the problem, a given row block
$rb_i$ should have at most one processor operating on it. In terms of the $Work$ matrix,
this means that no two processors can be operating within the same row or column at any
given time. If this constraint is not enforced, the results will be in an indeterminate
state (and will probably not be correct).
\end{frame}


\begin{frame}{Parallelization Idea}
Here is an example of the flow of control on a $Work$ matrix with $k = 4$. Let $t_i$ denote the $i$th time step of the
algorithm.

% First two frames of animation - shows up as upper row
\pause

\begin{displaymath}
\rightarrow
\begin{array}{|c|c|c|c|}
\hline
\color{red} t_1 & & & \\ \hline
& \color{red} t_1 & & \\ \hline
& & \color{red} t_1 & \\ \hline
& & & \color{red} t_1 \\ \hline
\end{array}
\pause
\rightarrow
\begin{array}{|c|c|c|c|}
\hline
t_1 & \color{red} t_2 & & \\ \hline
& t_1 & \color{red} t_2 & \\ \hline
& & t_1 & \color{red} t_2 \\ \hline
& & & t_1 \\ \hline
\end{array}
\end{displaymath}

\pause

% Last two frames of animation - shows up as lower row
\begin{displaymath}
\rightarrow
\begin{array}{|c|c|c|c|}
\hline
t_1 & t_2 & \color{red} t_3 & \\ \hline
& t_1 & t_2 & \color{red} t_3 \\ \hline
& & t_1 & t_2 \\ \hline
& & & t_1 \\ \hline
\end{array}
\pause
\rightarrow
\begin{array}{|c|c|c|c|}
\hline
t_1 & t_2 & t_3 & \color{red} t_4 \\ \hline
& t_1 & t_2 & t_3\\ \hline
& & t_1 & t_2 \\ \hline
& & & t_1 \\ \hline
\end{array}
\end{displaymath}
\end{frame}


\begin{frame}[shrink]{Processing a chunk of work}
\input{workchunk.tex}
\end{frame}

\section{Cell Specifics}
\begin{frame}[shrink]{Mailbox Communications PPE}
\begin{figure}
\centering
\fbox{\includegraphics[width=5in]{graphics/MailProtocol-PPE.png}}
\caption{PPE work control using interprocessor mailbox communication}
\label{fig:ppe_comm}
\end{figure}
\end{frame}

\begin{frame}[shrink]{Mailbox Communications SPE}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{graphics/MailProtocol-SPE.png}}
\caption{SPE work control using interprocessor mailbox communication}
\label{fig:spe_comm}
\end{figure}
\end{frame}

\begin{frame}[shrink]{Memory Access}
The $A$ and $A^{(i)}$ matrices are represented in memory as a contiguous row-major block of $4$ byte floating point numbers. Each SPE is
responsible for reading the appropriate rows into its local store, as they are needed. Rows may be too large to fit into the $256$ KB of
space (also shared with code), so they are brought over in small chunks---$32$ elements or $128$ bytes. This particular size was chosen
to make efficient use of DMAs, since they bring over $128$ byte fragments of memory per operation.

We employ the technique of double buffering to bring data into SPE local store while calculations are occurring. This is used when:
\begin{itemize}
\item reading a $128$ byte row fragment during the calculation phase
\item reading a $128$ byte row fragment during the application phase
\item writing a $128$ byte row fragment during the application phase
\end{itemize}

Double buffering is possible on the Cell because the SPEs do not block when a DMA has been issued, unless the programmer chooses to do so.
Note that after we apply a rotation to a pair of $128$ byte row fragments and issue a DMA to write results back into memory, there is no
need to wait for the DMA to complete before moving to the next pair of 
fragments.
\end{frame}


\begin{frame}[shrink]{Results}{Test II - Iterations}

\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/IterationsN128.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/IterationsN256.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/IterationsN512.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/IterationsN1024.jpg}}
\end{figure}

\end{frame}

\begin{frame}[shrink]{Results}{Test II - Time}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/TimeN128.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/TimeN256.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/TimeN512.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/TimeN1024.jpg}}
\end{figure}
\end{frame}

\begin{frame}[shrink]{Results}{Test III - Accuracy Analysis}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/rms.jpg}}
\end{figure}
\begin{figure}
\centering
\fbox{\includegraphics[width=2.5in]{testresults/R2.jpg}}
\end{figure}
\end{frame}

\begin{frame}[shrink]{References}
\nocite{*}
\bibliographystyle{plain}
\bibliography{project-refs}
\end{frame}

\end{document}



